---
output:
  html_document: default
  pdf_document: default
---

library(MASS)
library(tidyverse)
library(kableExtra)
library(latex2exp)
library(gplots)
library(pROC)
library(GGally)
library(knitr)

1) Linear regression fundamentals

# Simulate n = 100 observations from a known linear model so the true
# parameters (beta0 = 1.6, beta1 = 0, beta2 = -2, sigma = 3) are available
# for later comparison with the fitted estimates.
set.seed(1984)
n <- 100
x1 <- rnorm(n, mean = 10, sd = sqrt(10))
x2 <- rnorm(n, mean = 5, sd = sqrt(5))
y1 <- 1.6 + 0 * x1 - 2 * x2 + rnorm(n, mean = 0, sd = 3)
simulatedata <- data.frame(x1, x2, y1)

Although we simulated x1 and x2, assume that they are deterministic quantities. (a) Write the equation of a normal linear model which has x1 and x2 as predictors, in terms of βs, σs, and ε along with the distributional assumptions of\(\epsilon\). What are the true parameter values of β0, β1, β2, and σ? Are the true parameter values fixed (non-random) or random quantities?

\[y=\beta_0+\beta_1 x_1+\beta_2 x_2+\epsilon \] where \(\epsilon \sim N(\mu,\sigma^2)\) with \(\mu=0\) and \(\sigma=3\), and the true coefficients are \(\beta_0=1.6\), \(\beta_1=0\), \(\beta_2=-2\). The true parameter values are fixed (non-random) quantities.

simulation of alternative versions

# Alternative error structures on the same linear predictor:
# y2 uses heavy-tailed Cauchy errors (no finite mean or variance).
y2 <- 1.6+0*x1-2*x2+ rcauchy(n = n, location = 0, scale = 3)
# y3 uses normal errors whose sd grows with the observation index;
# note 1:n*3 parses as (1:n)*3, i.e. sd = 3, 6, ..., 3n.
y3 <- 1.6+0*x1-2*x2+ rnorm(n = n, mean = 0, sd = 1:n*3)
  1. Plot the density of y1,y2, and y3. What assumption of normal linear regression are we violating if we use y2 as the dependent variable? What assumption of normal linear regression are we violating if we use y3 as the dependent variable?
# Draw the three response densities side by side in a 1x3 panel layout.
par(mfrow=c(1,3))
plot(density(y1))
plot(density(y2))
plot(density(y3))

For \(y_2\) the errors follow a Cauchy distribution, which violates the normality assumption (the Cauchy has no finite variance). For \(y_3\) the errors are normal but their variance grows with the observation index, violating the constant-variance (homoscedasticity) assumption. In both cases the assumptions of the normal linear model fail.

  1. Fit a linear regression model where y1 is the dependent variable and x1, and x2 are the predictors (in order). What are the coefficient estimates of \(\beta_0\), \(\beta_1\),\(\beta_2\) and \(\sigma\)? What are the R2 and R2adj values of the fit? Write out the equation of the model with the estimated parameter values.
# Fit the normal linear model y1 ~ x1 + x2 and inspect the summary.
fit.simple=lm(y1~x1+x2, data = simulatedata)
S=summary(fit.simple)
S
## 
## Call:
## lm(formula = y1 ~ x1 + x2, data = simulatedata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8036 -1.9811 -0.3691  2.0812  8.2220 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.71237    1.16542  -1.469    0.145    
## x1           0.20195    0.09703   2.081    0.040 *  
## x2          -1.82409    0.13615 -13.397   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.862 on 97 degrees of freedom
## Multiple R-squared:  0.6501, Adjusted R-squared:  0.6429 
## F-statistic: 90.12 on 2 and 97 DF,  p-value: < 2.2e-16
# Point estimates of beta0, beta1, beta2.
fit.simple$coef 
## (Intercept)          x1          x2 
##  -1.7123651   0.2019501  -1.8240887
S$sigma # residual standard error = estimate of sigma
## [1] 2.862193
# R^2 and adjusted R^2 of the fit.
R2=S$r.squared
R2
## [1] 0.6501148
R2ad=S$adj.r.squared
R2ad
## [1] 0.6429007
S$sigma
## [1] 2.862193

\[y=\beta_0+\beta_1 x_1+\beta_2 x_2+\epsilon\] where \(\beta_0=-1.71237\), \(\beta_1=0.20195\) and \(\beta_2=-1.82409\) and \(\epsilon \sim N(0,2.862193)\).

  1. What is the p-value for the hypothesis test that \(\beta_1\)= 0? Do we reject the null hypothesis under the significance level of \(\alpha\)= 0.05? Write out the 95% confidence interval for all \(\beta\) parameter estimates using the confint function.
# p-value of the t-test for H0: beta1 = 0 (row 2, column 4 of the
# coefficient table).
p_val=S$coefficients[2,4]
p_val # since p_value is significant and less than .05 we reject the null hypothesis 
## [1] 0.04004259
# 95% confidence intervals for all beta parameters.
confint(fit.simple)
##                    2.5 %     97.5 %
## (Intercept) -4.025395861  0.6006656
## x1           0.009368959  0.3945312
## x2          -2.094316008 -1.5538615
  1. Use the linear model to give a prediction for an observation with the following co-variate values:x1 = 12; x2 = 7. State the prediction along with its 95% prediction interval.
# Point prediction with a 95% prediction interval for x1 = 12, x2 = 7.
newData <- data.frame(x1=12, x2=7)
modelPrediction <- predict(fit.simple, newData, interval = 'predict')
modelPrediction
##         fit       lwr       upr
## 1 -12.05759 -17.80471 -6.310463
  1. In class we discussed that the distribution of our \(\beta\) coefficient estimates was normal. Demonstrate this by running 25000 simulations where you treat y1 as the outcome of some experiment. For each iteration of the experiment you should store the estimates of \(\beta_0\), \(\beta_1\) and \(\beta_2\). At the end of the simulation, plot the densities of \(\beta_i\),i= 1,2,3. Add a red, broken vertical line to each density plot which corresponds to the true parameter value. Discuss what you see on the density plots.[Hint: recompute y1 at the beginning of each iteration.]
L <- 25000 # number of simulation replicates
# Preallocate one row per replicate for the stored estimates.
df <- data.frame(iteration = seq_len(L), beta0 = 0, beta1 = 0, beta2 = 0, sigma = 0)
# The design (x1, x2) stays fixed; only the error term is redrawn, so each
# replicate mimics rerunning the experiment with the same covariates.
for (rep_i in seq_len(L)) {
  ySim <- 1.6 + 0 * x1 - 2 * x2 + rnorm(n = n, mean = 0, sd = 3)
  # Refit the model and record the coefficient and sigma estimates.
  fitSim <- lm(ySim ~ x1 + x2)
  est <- coef(fitSim)
  df$beta0[rep_i] <- est[1]
  df$beta1[rep_i] <- est[2]
  df$beta2[rep_i] <- est[3]
  df$sigma[rep_i] <- summary(fitSim)$sigma
}
# Density of each stored estimate. The dashed red line marks the TRUE
# parameter value (beta0 = 1.6, beta1 = 0, beta2 = -2) as the exercise
# requires -- not the empirical mean of the simulated estimates, which
# the earlier version plotted. (The sigma facet gets no reference line.)
df %>%
  gather(var, val, -iteration) %>%
  ggplot(aes(x = val)) +
  geom_density() +
  facet_wrap(~var, scales = 'free') +
  geom_vline(data = tibble(val = c(1.6, 0, -2),
                           var = c('beta0', 'beta1', 'beta2')),
             aes(xintercept = val), lty = 2, col = 'red')

The density plots show that each coefficient estimate (and \(\hat\sigma\)) is approximately normally distributed and centered close to its true value, as expected.

2) Multiple linear regression

  1. Begin by loading the data into R. Recast the season variable as a factor variable and rename the levels such that 1 = spring; 2 = summer; 3 = fall; 4 = winter. Make sure that spring is the baseline level of the new factor variable.
# Read the semicolon-delimited data. factor() assigns labels in the order
# of the underlying codes (1..4), so 'spring' (code 1) automatically
# becomes the baseline (first) level.
bikesharing<-read.csv('bikesharing.csv', sep=';')
bikesharing$season=factor(bikesharing$season, labels = c('spring','summer','fall','winter'))
is.factor(bikesharing$season)
## [1] TRUE
  1. Perform an exploratory analysis on variables registered, season, temp, hum,wind speed. For the numerical variables, create a table which has the min, max,median, 2.5% quantile, 97.5% quantile, mean, and standard deviation. Determine the correlation matrix between the continuous variables and plot it using a heatmap.Discuss the table and the heatmap.
# Summary statistics for the continuous variables.
# NOTE(review): quantile() with default probs yields the 0/25/50/75/100%
# quantiles, but the exercise asks for the 2.5% and 97.5% quantiles
# (quantile(x, c(0.025, 0.975)) would match) -- confirm intent.
stats=bikesharing[,c('registered', 'temp', 'hum','windspeed')]
number.na <- apply(stats, 2, function(x) sum(is.na(x)))
means <- round(apply(stats, 2, mean),3)
stds <- round(apply(stats, 2, sd),3)
quantiles <- round(apply(stats, 2, quantile),3)

# Transpose so that variables are rows and statistics are columns.
table.data.1 <- as.data.frame(t(as.matrix(as.data.frame(rbind(number.na, means, stds, quantiles)))))
colnames(table.data.1) <- c('NA values','Mean', 'Std','Minimum', '25% quantile', 'Median', '75% quantile', 'Maximum')
table.data.1 %>%
  kbl() %>%
  kable_styling(full_width = F)
NA values Mean Std Minimum 25% quantile Median 75% quantile Maximum
registered 0 3656.172 1560.256 20.000 2497.000 3662.000 4776.500 6946.000
temp 0 0.495 0.183 0.059 0.337 0.498 0.655 0.862
hum 0 0.628 0.142 0.000 0.520 0.627 0.730 0.973
windspeed 0 0.190 0.077 0.022 0.135 0.181 0.233 0.507
# Drop columns 1 and 4 before computing the correlation matrix
# (presumably non-numeric columns such as an id/date and the season
# factor -- TODO confirm against the csv header), then plot as a heatmap.
drop=c(1,4)
heatmap.2(cor(bikesharing[,-drop]))

  1. For the categorical variable season, plot box-plots with season on the x-axis and registered on the y-axis. Describe what you see on the box-plots.
# Box-plots of the number of registered riders for each season.
bikesharing %>%
  ggplot(aes(x = season, y = registered)) +
  geom_boxplot(fill = "slateblue", alpha = 0.2) +
  xlab("season")

As we might expect, ridership rises in warmer weather: the median number of registered users increases from spring to summer, and then declines through fall and winter.

  1. Fit a linear regression model where the dependent variable is registered and the independent variable is season. Display the summary and interpret the coefficients. Look at the F-statistic; can we reject the ANOVA null hypothesis?
# One-way model registered ~ season: the intercept is the spring (baseline)
# mean; each coefficient is that season's difference from spring.
fitseas=lm(registered~season, data = bikesharing)
S1=summary(fitseas)
S1
## 
## Call:
## lm(formula = registered ~ season, data = bikesharing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4136.5 -1042.0  -181.1  1111.3  3576.4 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3528.1      100.9  34.961  < 2e-16 ***
## seasonsummer    919.6      142.7   6.444 2.12e-10 ***
## seasonfall      628.4      143.1   4.391 1.29e-05 ***
## seasonwinter  -1049.5      143.3  -7.324 6.42e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1369 on 727 degrees of freedom
## Multiple R-squared:  0.2335, Adjusted R-squared:  0.2303 
## F-statistic:  73.8 on 3 and 727 DF,  p-value: < 2.2e-16
# F statistic for the ANOVA null hypothesis (all season means equal).
S1$fstatistic
##     value     numdf     dendf 
##  73.80183   3.00000 727.00000
anova(fitseas)
## Analysis of Variance Table
## 
## Response: registered
##            Df     Sum Sq   Mean Sq F value    Pr(>F)    
## season      3  414867209 138289070  73.802 < 2.2e-16 ***
## Residuals 727 1362244763   1873789                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Equivalent one-way ANOVA fit of the same model.
summary(aov(fitseas))
##              Df    Sum Sq   Mean Sq F value Pr(>F)    
## season        3 4.149e+08 138289070    73.8 <2e-16 ***
## Residuals   727 1.362e+09   1873789                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since the p-value is < 2e-16, we reject the ANOVA null hypothesis that all season means are equal.

  1. Use Tukey’s test (TukeyHSD()) to compare the number of registered bikers between every season to each other. Report which differences are significant (\(\alpha\)= 0.05) and interpret the results.
# Tukey HSD pairwise comparisons of mean registered riders across seasons,
# with family-wise 95% confidence intervals.
TukeyHSD(aov(fitseas))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = fitseas)
## 
## $season
##                     diff        lwr        upr     p adj
## summer-spring   919.6359   552.1499  1287.1219 0.0000000
## fall-spring     628.4351   259.9409   996.9293 0.0000762
## winter-spring -1049.5123 -1418.5179  -680.5067 0.0000000
## fall-summer    -291.2008  -659.6950    77.2934 0.1761548
## winter-summer -1969.1482 -2338.1537 -1600.1426 0.0000000
## winter-fall   -1677.9474 -2047.9570 -1307.9377 0.0000000
# Keep only the comparisons that are significant at alpha = 0.05
# (this drops fall-summer, whose adjusted p-value is 0.176).
TukeyHSD(aov(fitseas))$season %>%
  as_tibble(rownames = 'Comparison') %>%
  rename(p.adj = 5) %>%
  filter(p.adj < 0.05) %>%
  kbl(align = 'c') %>%
  kable_styling()
Comparison diff lwr upr p.adj
summer-spring 919.6359 552.1499 1287.1219 0.00e+00
fall-spring 628.4351 259.9409 996.9293 7.62e-05
winter-spring -1049.5123 -1418.5179 -680.5067 0.00e+00
winter-summer -1969.1482 -2338.1537 -1600.1426 0.00e+00
winter-fall -1677.9474 -2047.9570 -1307.9377 0.00e+00

All pairwise comparisons except fall–summer (adjusted p = 0.176) are significant at the 0.05 level. For example, the first line states that the estimated difference between summer and spring is 919.6359 registered riders, with a 95% family-wise confidence interval of (552.1499, 1287.1219); the other significant comparisons are interpreted similarly.

plot(TukeyHSD(aov(fitseas)))

Every pair shows a significant difference except fall–summer, whose confidence interval covers zero.

  1. Extend the linear model from item (d) by adding variables temp, hum, windspeed. State the parameter estimates along with the estimate for \(\sigma\). State the R2value. Write out the equation of the model with the estimated parameter values.
# Extend the season-only model with the continuous covariates
# temp, hum, and windspeed.
fit2=lm(registered~season+temp+hum+windspeed, data = bikesharing)
Sfit2=summary(fit2)
Sfit2
## 
## Call:
## lm(formula = registered ~ season + temp + hum + windspeed, data = bikesharing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4077.3  -942.2  -130.6   979.1  2812.0 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3452.35     334.46  10.322  < 2e-16 ***
## seasonsummer  -547.61     166.30  -3.293  0.00104 ** 
## seasonfall     627.88     128.94   4.869 1.38e-06 ***
## seasonwinter   -45.27     155.61  -0.291  0.77120    
## temp          5517.23     456.05  12.098  < 2e-16 ***
## hum          -2945.36     340.56  -8.649  < 2e-16 ***
## windspeed    -3607.80     609.49  -5.919 4.99e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1208 on 724 degrees of freedom
## Multiple R-squared:  0.4057, Adjusted R-squared:  0.4008 
## F-statistic: 82.38 on 6 and 724 DF,  p-value: < 2.2e-16
# Estimate of sigma and the R^2 of the extended model.
Sfit2$sigma
## [1] 1207.764
Sfit2$r.squared
## [1] 0.4057242

The following dummy variables \(d_i\) respect to season (to avoid writing 4 equation) then \[ \hat{y}=\beta_0+\beta_1 d_{summer}+ \beta_2 d_{fall}+\beta_3 d_{winter}+\beta_4 x_1+\beta_5 x_2+\beta_6 x_3\]

  1. Store the R2adj, number of predictors, AIC, and RMSE of the model from item (d)and item (f) in a table. Compare the two models by discussing the table.
# Collect fit statistics for the models from items (d) and (f) into one
# comparison table: adjusted R^2, number of estimated coefficients,
# AIC, and in-sample RMSE on the original scale.
models <- c(1, 2)
R.adj <- c(S1$adj.r.squared, Sfit2$adj.r.squared)
Nub.Predictors <- c(length(fitseas$coefficients), length(fit2$coefficients))
aic <- c(AIC(fitseas), AIC(fit2))
RMSE <- c(sqrt(mean((bikesharing$registered - predict(fitseas))^2)),
          sqrt(mean((bikesharing$registered - predict(fit2))^2)))
table.data.3 <- as.data.frame(cbind(models, Nub.Predictors, R.adj, aic, RMSE))
colnames(table.data.3) <- c('Model', 'Number.Predictors',
                            '$R_{\\text{adj}}^2$', 'AIC', '$\\text{RMSE}$')

table.data.3 %>%
  kbl() %>%
  kable_styling(full_width = F)
Model Number.Predictors \(R_{\text{adj}}^2\) AIC \(\text{RMSE}\)
1 4 0.2302870 12638.66 1365.114
2 7 0.4007993 12458.58 1201.967

The model from item (f) has a lower AIC and RMSE and a higher adjusted R², so it fits the data better.

  1. Check for high-leverage points, outliers and influential points for the model from item (f). Display and discuss the main diagnostics plots for high-leverage points, outliers and influential points. Perform a statistics test for potential outliers and discuss the results.
# Diagnostics frame for model (f): fortify() gives fitted values, residuals,
# leverages (.hat) and Cook's distances; add jackknife (studentized-deleted)
# residuals and row labels.
fort.2 <- fortify(fit2)
fort.2$jackknife <- rstudent(fit2)
fort.2$rn <- row.names(fort.2)
n <- nrow(fort.2)
p <- ncol(model.matrix(fit2))
fort.2$obsN <- row.names(fort.2)
fort.2$index <- 1:nrow(fort.2)
# Leverage index plot; the dashed red line is the 2p/n rule-of-thumb cutoff.
fort.2 %>%
  ggplot(aes(x = index, y = .hat)) +
  geom_point() +
  geom_hline(yintercept = 2*p/n, lty = 2, col = 'red') +
  geom_text(aes(label = ifelse(.hat > 2*p/n, obsN, '')), hjust = -0.5)

Observations 50 and 69 have the highest leverage, but there are other points above the threshold as well, such as observations 302, 45, and 65; see the following table:

# Table of observations whose leverage exceeds 2p/n, largest first.
fort.2 %>%
  filter(.hat > 2*p/n) %>%
  arrange(desc(.hat)) %>%
  select(obsN, .hat) %>%
  kbl(align = 'c') %>%
  kable_styling()
obsN .hat
50 0.0404726
69 0.0330918
302 0.0285446
239 0.0241487
65 0.0240762
45 0.0232324
90 0.0224346
627 0.0209779
694 0.0208881
293 0.0204762
668 0.0203174
62 0.0198301
# Index plots of standardized and jackknife residuals with heuristic
# outlier thresholds at +/-2.5 (green) and +/-3 (red); points beyond
# +/-3 are labelled.
fort.2 %>%
  dplyr::select(.stdresid, jackknife, rn) %>%
  gather(residual, value, -rn) %>%
  mutate(residual = factor(residual, 
                           levels = c('.stdresid', 'jackknife'))) %>%
  ggplot(aes(x = rn, y = value)) + 
  geom_point(size = 1) +
  geom_hline(yintercept = 0, lty = 2, col = 'red') +
   geom_hline(yintercept = 2.5, lty = 2, col = 'green') +
  geom_hline(yintercept = -2.5, lty = 2, col = 'green') +
  geom_hline(yintercept = 3, lty = 2, col = 'red') +
  geom_hline(yintercept = -3, lty = 2, col = 'red') +
  geom_text(aes(label = ifelse(abs(value) > 3, rn, '')), vjust = -0.5) +
  xlab("Index") +
  ylab("Residuals") +
  ggtitle("Index plot of standardized and jackknife residuals") +
  theme(plot.title = element_text(hjust = 0.5, size = 15)) +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) + 
  facet_wrap(~residual, scales = 'free')

From the studentized and jackknife residual plots we see that several observations exceed the heuristic thresholds (shown at ±2.5 and ±3). Let's confirm this graphical exploration by comparing the largest jackknife residuals to the theoretical t value obtained after a Bonferroni correction.

# Formal outlier test: compare the largest |jackknife| residuals against
# the t critical value with and without a Bonferroni correction for the
# n simultaneous tests.
p = length(coef(fit2))
n = length(fitted(fit2))
fort.2$obsN <- row.names(fort.2)
alpha <- 0.05
theoreticalT <- qt(p = 1 - alpha/(2 * n), n - p - 1)
theoreticalNoBon <- qt(p = 1 - alpha/2, n - p - 1)
tab <- tibble(jackknife = abs(fort.2$jackknife[abs(fort.2$jackknife) > 3]),
       theoreticalT  = theoreticalT ,
       OutlierBon = jackknife > theoreticalT ,
       theoreticalNoBon = theoreticalNoBon,
       OutlierNoBon = jackknife > theoreticalNoBon)  %>%
  kbl(align = 'c') %>%
  kable_styling(full_width = F)
tab
jackknife theoreticalT OutlierBon theoreticalNoBon OutlierNoBon
3.459055 4.005112 FALSE 1.963251 TRUE
# Cook's distance index plot; 0.05 is an ad hoc cutoff used here to label
# potentially influential observations.
fort.2 %>%
  ggplot(aes(x = index, y = .cooksd)) +
  geom_point() +
  geom_hline(yintercept = 0.05, lty = 2) +
  geom_text(aes(label = ifelse(.cooksd > 0.05, obsN, '')), hjust = -0.5)

# Cross-tabulate the flagged observations against the three criteria:
# high leverage (> 2p/n), Bonferroni outlier, and Cook's distance > 0.05.
suspicious <- c(50,69,302,239,45,65, 293,62,668,694,627)
fort.2 %>%
  filter(obsN %in% suspicious) %>%
  mutate(highLeverage = .hat > 2*p/n,
         outlier = abs(jackknife) > theoreticalT,
         influential = .cooksd > 0.05) %>%
  select(obsN, highLeverage, outlier, influential) %>%
  mutate(totalMarks = highLeverage + outlier + influential) %>%
  kbl(align = 'c') %>%
  kable_styling()
obsN highLeverage outlier influential totalMarks
45 TRUE FALSE FALSE 1
50 TRUE FALSE FALSE 1
62 TRUE FALSE FALSE 1
65 TRUE FALSE FALSE 1
69 TRUE FALSE TRUE 2
239 TRUE FALSE FALSE 1
293 TRUE FALSE FALSE 1
302 TRUE FALSE FALSE 1
627 TRUE FALSE FALSE 1
668 TRUE FALSE FALSE 1
694 TRUE FALSE FALSE 1
# Refit model (f) leaving out one suspicious observation at a time and
# record the adjusted R^2 of each leave-one-out refit.
indexes <- c(50,69,302,239,45,65, 293,62,668,694,627)
r.adj <- rep(0, 11)
for (k in seq_along(indexes)) {
  bikesharing.update <- bikesharing[-indexes[k], ]
  fit2.update <- lm(registered ~ season + temp + hum + windspeed,
                    data = bikesharing.update)
  r.adj[k] <- summary(fit2.update)$adj.r.squared
}

# Present the index / adjusted-R^2 pairs as a small table.
table.data.2 <- as.data.frame(cbind(indexes, r.adj))
colnames(table.data.2) <- c('Index', '$R_{\\text{adj}}^2$')

table.data.2 %>%
  kbl() %>%
  kable_styling(full_width = F)
Index \(R_{\text{adj}}^2\)
50 0.4012560
69 0.4073814
302 0.3982012
239 0.4003234
45 0.4006081
65 0.3979838
293 0.4008911
62 0.4001896
668 0.4002501
694 0.4005677
627 0.4007928

The adjusted R² of the full model is 0.4008. The table, and a closer examination of the models created in the loop, suggests that removing some of these points (e.g. observation 69) changes the fit noticeably, so they merit attention.

  1. Assess the assumptions of the model from item (f) by looking at the residual plots,the fitted-versus-residuals plot, and the QQ-plot. Discuss what you see.
# Index plot of the raw residuals of model (f); +/-2750 is an ad hoc
# threshold used only to decide which points get labelled.
fort.2 %>%
  dplyr::select(.resid, rn) %>%
  gather(residual, value, -rn) %>%
  mutate(residual = factor(residual, 
                           levels = '.resid')) %>%
  ggplot(aes(x = rn, y = value)) + 
  geom_point(size = 1) +
  geom_hline(yintercept = 0, lty = 2, col = 'red') +
  geom_hline(yintercept = 2750, lty = 2, col = 'red') +
  geom_hline(yintercept = -2750, lty = 2, col = 'red') +
  geom_text(aes(label = ifelse(abs(value) >2750, rn, '')), vjust = -0.5) +
  xlab("Index") + 
  ylab("Residuals") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) + 
  ggtitle("Index plot of residuals") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

There appears to be an increasing trend in the residuals across the index.

# QQ-plot of the standardized residuals against the N(0,1) reference line
# (dashed blue y = x) to assess the normality assumption.
tibble(fort.2$.stdresid) %>%
  gather(type, val) %>%
  ggplot(aes(sample = val)) +
  stat_qq(size = 1) +
  geom_abline(slope = 1, intercept = 0, lty = 2, col = 'blue') +
  xlab("Theoretical value") + 
  ylab("Sample value") +
  ggtitle("QQ-plot of the residuals") + 
  theme(plot.title = element_text(hjust = 0.5, size = 15))

We see evidence of non-normality in the tails of the QQ-plot, likely driven by the outlying points identified earlier.

# Fitted-versus-residuals plot with a loess smoother to reveal any
# systematic trend or non-constant variance.
tibble(yFitted = fort.2$.fitted, residual = fort.2$.resid, rn = fort.2$rn) %>%
  ggplot(aes(x = yFitted, y = residual)) + 
  geom_point(size = 1) + 
  geom_hline(yintercept = 0, lty = 2, col = 'red') +
  geom_smooth(method = 'loess') +
  xlab("Fitted value") +
  ylab("Residual") +
  ggtitle("Residual vs. fitted value") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

(j)Based on the results from items (g), (h) and (i) suggest improvements to the model.

The coefficient for seasonwinter is not significant, so we could consider a model that treats winter together with the baseline level; this might help simplify and improve the model.

To summarize our findings: there are some deviations from the normality assumption, there appears to be heteroskedasticity, and several points look like outliers or influential observations. We can potentially improve the model by applying transformations.

Multiple linear regression continued

We continue with the bikesharing.csv data and the model you fitted in the previous section.

  1. What power transform does the boxcox() function suggest could improve the model from the previous section? That is, which value of \(\lambda\) maximizes the log-likelihood?
# Refit model (f) and compute the Box-Cox profile likelihood; the lambda
# maximizing the log-likelihood suggests a power transform of the response.
fit2=lm(registered~season+temp+hum+windspeed, data = bikesharing)
bc<-boxcox(fit2)

lambda <- bc$x[which.max(bc$y)]
lambda
## [1] 0.7070707
  1. Add the log transform of registered and the Box-Cox transform of registered to your bikehsaring data set. Plot the density of the number registered bikers as well as its log transformation and boxcox() transformation. Discuss what you see.
# Add the log transform and the Box-Cox transform
# T_lambda(y) = (y^lambda - 1) / lambda of the response, then compare the
# three densities.
bikesharing$log_registered<-log(bikesharing$registered)
bikesharing$T_lambda_registered<-((bikesharing$registered)^(lambda)-1)/lambda

plot(density(bikesharing$registered))

plot(density(bikesharing$log_registered))

plot(density(bikesharing$T_lambda_registered))

It seems that the Box-Cox transformed density (and even the untransformed one) looks better behaved than the log transformation.

  1. Fit a new model where registered has been replaced with its log transform. You should include the variables temp, hum, wind speed as independent variables. You should also add new variables to the model from the data set. For each variable you choose not to include, explain why you are omitting that variable. For each variable you choose to include, explain why you are including that variable. You can include interaction effects.

I add workingday, holiday, and casual. The remaining variables are omitted because they carry similar information to those already included or show little variation.

# Model with the log-transformed response plus the added covariates
# workingday, holiday, and casual.
fitlog=lm(log_registered~season+temp+hum+windspeed+workingday+holiday+casual, data = bikesharing)
Sl=summary(fitlog)
Sl
## 
## Call:
## lm(formula = log_registered ~ season + temp + hum + windspeed + 
##     workingday + holiday + casual, data = bikesharing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7004 -0.1925  0.0275  0.2418  0.8553 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.351e+00  1.205e-01  61.012  < 2e-16 ***
## seasonsummer -1.097e-01  5.466e-02  -2.007   0.0451 *  
## seasonfall    2.099e-01  4.143e-02   5.065 5.20e-07 ***
## seasonwinter  1.007e-01  5.136e-02   1.960   0.0504 .  
## temp          1.099e+00  1.667e-01   6.594 8.29e-11 ***
## hum          -7.173e-01  1.148e-01  -6.248 7.09e-10 ***
## windspeed    -9.029e-01  2.014e-01  -4.484 8.54e-06 ***
## workingday    6.059e-01  4.427e-02  13.688  < 2e-16 ***
## holiday      -7.905e-02  8.961e-02  -0.882   0.3780    
## casual        4.084e-04  3.678e-05  11.104  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3864 on 721 degrees of freedom
## Multiple R-squared:  0.5387, Adjusted R-squared:  0.5329 
## F-statistic: 93.54 on 9 and 721 DF,  p-value: < 2.2e-16
  1. State the estimates for the parameters as well as \(\sigma\) for your new model. State which of the predictors are significant at \(\alpha\)= 0.05.
# Sigma estimate and R^2 values of the log-response model
# (all on the log scale).
Sl$sigma
## [1] 0.3863846
Sl$adj.r.squared
## [1] 0.5329133
Sl$r.squared
## [1] 0.5386719

All variables except holiday and seasonwinter are significant at \(\alpha = 0.05\).

  1. Add the number of parameters, R2adj, AIC, and RMSE (on the relevant scale) of the model you created in (c) to the table you created in part II (g). Display the full table and discuss what you see; is your new model an improvement?
# Extend the comparison table with the log-response model.
# NOTE(review): AIC of model 3 is computed on the log(registered) scale and
# is not directly comparable to models 1 and 2 without a Jacobian
# adjustment; RMSE is back-transformed via exp() so it IS comparable.
models <- c(1,2,3)
R.adj <- rep(0,3)
aic <- rep(0,3)
RMSE<- rep(0,3)
R.adj[1]=S1$adj.r.squared
R.adj[2]=Sfit2$adj.r.squared
R.adj[3]=Sl$adj.r.squared
aic[1]=AIC(fitseas)
aic[2]=AIC(fit2)
aic[3]=AIC(fitlog)
RMSE[1]=sqrt(mean((bikesharing$registered - predict(fitseas))^2))
RMSE[2]=sqrt(mean((bikesharing$registered - predict(fit2))^2))
# Predictions of fitlog are on the log scale, so exponentiate before
# computing RMSE on the original scale.
RMSE[3]=sqrt(mean((bikesharing$registered - exp(predict(fitlog)))^2))
N.predictors=rep(0,3)
N.predictors[1]=length(fitseas$coefficients)
N.predictors[2]=length(fit2$coefficients)
N.predictors[3]=length(fitlog$coefficients)
table.data.3 <- as.data.frame(cbind(models,N.predictors ,R.adj, aic, RMSE))
colnames(table.data.3) <- c('Model','Nub.var' ,'$R_{\\text{adj}}^2$', 'AIC', '$\\text{RMSE}$')

table.data.3 %>%
  kbl() %>%
  kable_styling(full_width = F)
Model Nub.var \(R_{\text{adj}}^2\) AIC \(\text{RMSE}\)
1 4 0.2302870 12638.656 1365.114
2 7 0.4007993 12458.576 1201.967
3 10 0.5329133 696.171 1056.917

The AIC of model 3 is not directly comparable to models 1 and 2 because its response is on the log scale; however, the RMSE (computed on the original scale after back-transforming) is smallest for model 3, and its adjusted R² is highest, so model 3 appears to be the best of the three.

  1. Check for high-leverage points, outliers and influential points for the model from(c) Display and discuss the main diagnostics plots for high-leverage points, outliers and influential points. Perform a statistics test for potential outliers and discuss the results.
# Diagnostics for the log-response model: standardized and jackknife
# residual index plots with the +/-3 labelling threshold.
fort.3 <- fortify(fitlog)
fort.3$.jackknife <- rstudent(fitlog)
fort.3$rn <- row.names(fort.3)
fort.3 %>%
  dplyr::select(.stdresid, .jackknife, rn) %>%
  gather(residual, value, -rn) %>%
  mutate(residual = factor(residual, 
                           levels = c('.stdresid', '.jackknife'))) %>%
  ggplot(aes(x = rn, y = value)) + 
  geom_point(size = 1) +
  geom_hline(yintercept = 0, lty = 2, col = 'red') +
  geom_hline(yintercept = 3, lty = 2, col = 'red') +
  geom_hline(yintercept = -3, lty = 2, col = 'red') +
  geom_text(aes(label = ifelse(abs(value) > 3, rn, '')), vjust = -0.5) +
  xlab("Index") +
  ylab("Residuals") +
  ggtitle("Index plot of standardized and jackknife residuals") +
  theme(plot.title = element_text(hjust = 0.5, size = 15)) +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) + 
  facet_wrap(~residual, scales = 'free')

# Bonferroni-corrected outlier test for the log-response model: compare
# the largest |jackknife| residuals to the corrected and uncorrected
# t critical values.
p = length(coef(fitlog))
n = length(fitted(fitlog))
alpha <- 0.05
tCrit <- qt(p = 1 - alpha/(2 * n), n - p - 1)
tNoBon <- qt(p = 1 - alpha/2, n - p - 1)
tab <- tibble(jackknife = abs(fort.3$.jackknife[abs(fort.3$.jackknife) > 3]),
       tCrit = tCrit,
       OutlierBon = jackknife > tCrit,
       tNoBon = tNoBon,
       OutlierNoBon = jackknife > tNoBon)  %>%
  kbl(align = 'c') %>%
  kable_styling(full_width = F)
tab
jackknife tCrit OutlierBon tNoBon OutlierNoBon
3.340542 4.00521 FALSE 1.963264 TRUE
4.347921 4.00521 TRUE 1.963264 TRUE
4.860311 4.00521 TRUE 1.963264 TRUE
13.821477 4.00521 TRUE 1.963264 TRUE
3.643416 4.00521 FALSE 1.963264 TRUE
# List the observations that the Bonferroni-corrected test flags
# as outliers.
theoreticalT <- qt(p = 1 - 0.05/(2 * n), df = n - p - 1)
fort.3$obsN <- row.names(fort.3)
fort.3 %>%
  mutate(theoretical = theoreticalT,
         rejectNull = abs(.jackknife) > theoretical) %>%
  filter(rejectNull == T) %>%
  select(obsN, .jackknife, theoretical, rejectNull) %>%
  kbl(align = 'c') %>%
  kable_styling()
obsN .jackknife theoretical rejectNull
27 -4.347921 4.00521 TRUE
69 -4.860311 4.00521 TRUE
668 -13.821477 4.00521 TRUE

This shows that observations 27, 69, and 668 are outliers under the Bonferroni-corrected test.

# Leverage index plot for the log-response model; the dashed red line is
# the 2p/n rule-of-thumb cutoff.
fort.3 %>%
  ggplot(aes(x = rn, y = .hat)) +
  geom_point(size = 1) +
  geom_hline(yintercept = (2*p)/n, lty = 2, col = 'red') +
  geom_text(aes(label = ifelse(.hat > 2*p/n, rn, '')), hjust = -0.5) +
  xlab("Index") +
  ylab("Leverages") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  ggtitle("Index plot of leverages") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

# Cross-tabulate all flagged observations against the three criteria
# (high leverage, Bonferroni outlier, Cook's distance > 0.05).
suspicious <- c(27,50,69,668,150,105,204,17,185,315,248,283,381,367,360,416, 463, 472,52,514,551,52,612,647,682,725,692 ,293,694,627)
fort.3 %>%
  filter(obsN %in% suspicious) %>%
  mutate(highLeverage = .hat > 2*p/n,
         outlier = abs(.jackknife) > theoreticalT,
         influential = .cooksd > 0.05) %>%
  select(obsN, highLeverage, outlier, influential) %>%
  mutate(totalMarks = highLeverage + outlier + influential) %>%
  kbl(align = 'c') %>%
  kable_styling()
obsN highLeverage outlier influential totalMarks
17 TRUE FALSE FALSE 1
27 FALSE TRUE FALSE 1
50 TRUE FALSE FALSE 1
52 TRUE FALSE FALSE 1
69 TRUE TRUE TRUE 3
105 TRUE FALSE FALSE 1
150 TRUE FALSE FALSE 1
185 TRUE FALSE FALSE 1
204 TRUE FALSE FALSE 1
248 TRUE FALSE FALSE 1
283 TRUE FALSE FALSE 1
293 FALSE FALSE FALSE 0
315 TRUE FALSE FALSE 1
360 TRUE FALSE FALSE 1
367 TRUE FALSE FALSE 1
381 TRUE FALSE FALSE 1
416 TRUE FALSE FALSE 1
463 TRUE FALSE FALSE 1
472 TRUE FALSE FALSE 1
514 TRUE FALSE FALSE 1
551 TRUE FALSE FALSE 1
612 TRUE FALSE FALSE 1
627 FALSE FALSE FALSE 0
647 TRUE FALSE FALSE 1
668 FALSE TRUE TRUE 2
682 TRUE FALSE FALSE 1
692 TRUE FALSE FALSE 1
694 FALSE FALSE FALSE 0
725 TRUE FALSE FALSE 1

From the table, observations 27 and 668 are outliers, and 69 and 668 are also influential; observation 69 is additionally a high-leverage point.

  1. Assess the assumptions of the model from (c) by looking at the residual plots, the fitted-versus-residuals plot, and the QQ-plot. Discuss what you see.
# Index plot of the raw residuals of the log-response model; +/-2 is an
# ad hoc threshold used only for labelling points.
fort.3 %>%
  dplyr::select(.resid, rn) %>%
  gather(residual, value, -rn) %>%
  mutate(residual = factor(residual, 
                           levels = '.resid')) %>%
  ggplot(aes(x = rn, y = value)) + 
  geom_point(size = 1) +
  geom_hline(yintercept = 0, lty = 2, col = 'red') +
  geom_hline(yintercept = 2, lty = 2, col = 'red') +
  geom_hline(yintercept = -2, lty = 2, col = 'red') +
  geom_text(aes(label = ifelse(abs(value) >2, rn, '')), vjust = -0.5) +
  xlab("Index") + 
  ylab("Residuals") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) + 
  ggtitle("Index plot of residuals") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

# QQ-plot of the standardized residuals of the log-response model.
tibble(fort.3$.stdresid) %>%
  gather(type, val) %>%
  ggplot(aes(sample = val)) +
  stat_qq(size = 1) +
  geom_abline(slope = 1, intercept = 0, lty = 2, col = 'blue') +
  xlab("Theoretical value") + 
  ylab("Sample value") +
  ggtitle("QQ-plot of the residuals") + 
  theme(plot.title = element_text(hjust = 0.5, size = 15))

The QQ-plot shows clear departures from normality for some points, and in the residual-index plot the points are not evenly spread, suggesting the residuals are not perfectly normally distributed.

  1. Plot the partial regression plot for each predictor you treat as a continuous variablein your modeling. Discuss what you see.
car::avPlots(fitlog, id.n=2, id.cex=0.7)

We can easily see that observations 69 and 668 stand out as outliers in each partial regression plot.

  1. Based on the results from items (e), (f), (g), and (h) suggest improvements to themodel.

We can remove the outliers one at a time (not all at once) and refit the model to look for a better fit.

In any case, I would also like to examine the model with the Box-Cox transformed response.

# Model using the Box-Cox transformed response.
# NOTE(review): unlike fitlog, this model omits 'casual' -- confirm whether
# that exclusion is intentional.
fit.bc=lm(T_lambda_registered~season+temp+hum+windspeed+workingday+holiday, data = bikesharing)
fort.4 <- fortify(fit.bc)
fort.4$.jackknife <- rstudent(fit.bc)
fort.4$rn <- row.names(fort.4)
# Index plot of raw residuals; +/-250 is an ad hoc labelling threshold
# (the response is on the Box-Cox scale).
fort.4 %>%
  dplyr::select(.resid, rn) %>%
  gather(residual, value, -rn) %>%
  mutate(residual = factor(residual, 
                           levels = '.resid')) %>%
  ggplot(aes(x = rn, y = value)) + 
  geom_point(size = 1) +
  geom_hline(yintercept = 0, lty = 2, col = 'red') +
  geom_hline(yintercept = 250, lty = 2, col = 'red') +
  geom_hline(yintercept = -250, lty = 2, col = 'red') +
  geom_text(aes(label = ifelse(abs(value) >250, rn, '')), vjust = -0.5) +
  xlab("Index") + 
  ylab("Residuals") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) + 
  ggtitle("Index plot of residuals") +
  theme(plot.title = element_text(hjust = 0.5, size = 15))

# QQ-plot of the standardized residuals of the Box-Cox model.
tibble(fort.4$.stdresid) %>%
  gather(type, val) %>%
  ggplot(aes(sample = val)) +
  stat_qq(size = 1) +
  geom_abline(slope = 1, intercept = 0, lty = 2, col = 'blue') +
  xlab("Theoretical value") + 
  ylab("Sample value") +
  ggtitle("QQ-plot of the residuals") + 
  theme(plot.title = element_text(hjust = 0.5, size = 15))

Logistic regression

For this problem we will use the Gusto.txt data. The data consists of 2188 observations and 26 columns. The variable of interest is DAY30, which represents the 30-day survival of patients who suffered myocardial infarction (heart attack). If DAY30 = 0, the patient survived, and if DAY30 = 1 the patient died. We are interested in determining which risk factors are associated with survival.

We are going to limit our analysis to the following variables: AGE: age (years); SEX: 0 = male, 1 = female; DIA: 0 = no diabetes, 1 = diabetes; HTN: 0 = no hypertension, 1 = hypertension; SMK: 1 = current smoker, 2 = former smoker, 3 = never smoked; PMI: 0 = no previous myocardial infarction, 1 = previous myocardial infarction; HEI: height (cm); WEI: weight (kg).

  1. Load the data into R and reduce it so it only has the variables listed above. Recast variable SMK as a factor variable with baseline 1 = current smoker. Replace the variables HEI and WEI with the variable BMI defined as weight (kg) / height (m)^2. Perform an exploratory analysis on the data by determining the number of observations for each level of DAY30 and by computing the mean, standard deviation, 25% and 75% quantiles for each continuous variable stratified by the levels of DAY30. Present your results in a table and discuss the table.
library(readr)
# NOTE(review): hard-coded absolute path is not portable; also
# read_table2() is deprecated in current readr in favour of read_table().
GustoW <- read_table2("/Users/farhadzare/Desktop/Applied linear reg/GustoW.txt")
# Keep only the columns of interest (DAY30, SEX, AGE, DIA, PMI, HTN,
# SMK, HEI, WEI -- selected by position; TODO confirm against the header).
GustoW<-GustoW[,c(1,2,3,7,11,12,13,14,15)]
# Recast SMK as a factor; the first level ('current') is the baseline,
# as required (level 1 = current smoker).
GustoW$SMK=factor(GustoW$SMK, labels = c('current','former','never'))
is.factor(GustoW$SMK)
## [1] TRUE
# NOTE(review): HEI is in cm, so this computes kg/cm^2, not the
# conventional BMI of kg/m^2 (hence the values ~0.0028 below instead of
# ~28).  Dividing HEI by 100 first would give standard BMI; because this
# is a pure linear rescaling of one covariate, the model fit, AUC and
# Brier score are unaffected -- only the BMI coefficient's scale changes.
# Left as-is for internal consistency with the prediction vector in (f).
GustoW$BMI=GustoW$WEI/(GustoW$HEI^2)

# Drop the raw height/weight columns now that BMI replaces them.
drop <- c("HEI","WEI")
GustoW_update = GustoW[,!(names(GustoW) %in% drop)]

# Summary statistics for each variable stratified by DAY30.  Column 7
# (SMK, the factor) is excluded since these summaries need numeric input.
GustoW_update[,-7] %>%
  gather(variable, value, -DAY30) %>%
  group_by(DAY30, variable) %>%
  summarize(n = n(),
            mean = mean(value),
            sd = sd(value),
            min = min(value),
            q25 = quantile(value, 0.25),
            median = median(value),
            q75 = quantile(value, 0.75),
            max = max(value)) %>%
  arrange(variable, DAY30) %>%
  kbl(align = 'c', booktabs = T) %>%
  kable_styling(full_width = F)
DAY30 variable n mean sd min q25 median q75 max
0 AGE 2053 59.7516254 11.7210154 23.9100000 50.5160000 59.8750000 69.0470000 88.8440000
1 AGE 135 71.3814296 11.3569271 31.4220000 65.1250000 73.0160000 79.3205000 89.4840000
0 BMI 2053 0.0028006 0.0005132 0.0014255 0.0024580 0.0027303 0.0030577 0.0058125
1 BMI 135 0.0026160 0.0005332 0.0016764 0.0022826 0.0025489 0.0028028 0.0049867
0 DIA 2053 0.1402825 0.3473645 0.0000000 0.0000000 0.0000000 0.0000000 1.0000000
1 DIA 135 0.1777778 0.3837495 0.0000000 0.0000000 0.0000000 0.0000000 1.0000000
0 HTN 2053 0.4018509 0.4903916 0.0000000 0.0000000 0.0000000 1.0000000 1.0000000
1 HTN 135 0.4296296 0.4968669 0.0000000 0.0000000 0.0000000 1.0000000 1.0000000
0 PMI 2053 0.1626887 0.3691714 0.0000000 0.0000000 0.0000000 0.0000000 1.0000000
1 PMI 135 0.3037037 0.4615689 0.0000000 0.0000000 0.0000000 1.0000000 1.0000000
0 SEX 2053 0.2367267 0.4251767 0.0000000 0.0000000 0.0000000 0.0000000 1.0000000
1 SEX 135 0.4296296 0.4968669 0.0000000 0.0000000 0.0000000 1.0000000 1.0000000
  1. Fit a logistic regression model to the data which includes all the variables as predictors (be careful! We have replaced and removed some variables). State the estimates for the parameters. Plot the ROC curve of the model. What do the x and y axes on the ROC curve plot represent? Compute the AUC of the ROC curve, and compute the Brier score of the model, where the Brier score is defined as \[B := \frac{1}{n}\sum_{i=1}^n (y_i - \hat{\pi}_i)^2,\] where \(\hat{\pi}_i\) are the predicted probabilities of the model.
# Dodged bar chart of smoking status split by 30-day mortality.
GustoW_update %>%
  count(DAY30, SMK) %>%
  ggplot(aes(x = SMK, y = n, fill = factor(DAY30))) +
  geom_col(position = 'dodge') +
  scale_fill_brewer(type = 'seq', palette = 'Set1') +
  theme(legend.position = 'bottom')

# Logistic regression of 30-day mortality on all remaining predictors.
glm1 <- glm(DAY30 ~ ., family = binomial, data = GustoW_update)
glm1S <- summary(glm1)
glm1S
## 
## Call:
## glm(formula = DAY30 ~ ., family = binomial, data = GustoW_update)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0961  -0.3865  -0.2450  -0.1497   3.4711  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -7.90120    1.00686  -7.847 4.25e-15 ***
## SEX            0.47193    0.20304   2.324 0.020106 *  
## AGE            0.08948    0.01089   8.215  < 2e-16 ***
## DIA            0.31345    0.25287   1.240 0.215145    
## PMI            0.73394    0.20891   3.513 0.000443 ***
## HTN           -0.16735    0.19513  -0.858 0.391098    
## SMKformer     -0.19232    0.25526  -0.753 0.451196    
## SMKnever      -0.15030    0.24984  -0.602 0.547453    
## BMI         -336.98688  216.19819  -1.559 0.119069    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1013.57  on 2187  degrees of freedom
## Residual deviance:  862.68  on 2179  degrees of freedom
## AIC: 880.68
## 
## Number of Fisher Scoring iterations: 6
library(pROC)
# ROC of DAY30 against the model's linear predictor.  predict() on the
# link (log-odds) scale is fine here: ROC and AUC are invariant to any
# monotone transform of the score, so probabilities are not needed.
Roc0=roc(GustoW_update$DAY30, predict(glm1))

plot(Roc0,print.auc=TRUE)

# Reuse the ROC object computed above instead of rebuilding it from
# scratch (the original called roc() a second time with the same inputs).
AUC=auc(Roc0)
AUC
## Area under the curve: 0.7836
# Brier score: mean squared difference between the 0/1 outcome and the
# predicted probability (type = 'response' gives probabilities).
BRIER=mean((GustoW_update$DAY30 - predict(glm1, type = 'response'))^2)
BRIER
## [1] 0.05187348

The y-axis of the ROC curve shows the true positive rate (sensitivity) and the x-axis shows the false positive rate (1 − specificity).

  1. Use the method of backward selection to eliminate predictors until all coefficient p-values are less than or equal to \(\alpha'\) = 0.2. Keep track of the AIC, AUC, and Brier score of the model for each iteration of the elimination process, as well as which predictor was eliminated. Present the results in a table with columns Iteration, Predictor eliminated, AIC, AUC, Brier. The 0-th iteration is the model from (b) and it should be included in the table (the predictor-eliminated cell is empty there). Once you have reached the stopping criterion, state the estimates of the model on the log-odds scale along with their p-values and plot the ROC curve of the model.
# Candidates for elimination: coefficients with p-value (column 4) > 0.2.
glm1S$coefficients[glm1S$coefficients[,4]>.2,]
##             Estimate Std. Error    z value  Pr(>|z|)
## DIA        0.3134467  0.2528728  1.2395428 0.2151445
## HTN       -0.1673493  0.1951305 -0.8576272 0.3910983
## SMKformer -0.1923167  0.2552573 -0.7534231 0.4511957
## SMKnever  -0.1503009  0.2498433 -0.6015807 0.5474533
# Backward elimination tracked by hand.  The original copy-pasted one
# block of code per iteration; here a single helper computes the three
# metrics and we loop over the nested sequence of models.  Iterations
# are numbered from 0 as the assignment requires: iteration 0 is the
# full model from (b), with an empty "predictor eliminated" cell.
model_metrics <- function(model) {
  # AIC, AUC of the ROC curve, and Brier score for one fitted glm.
  c(AIC = AIC(model),
    AUC = as.numeric(auc(roc(GustoW_update$DAY30, predict(model)))),
    Brier = mean((GustoW_update$DAY30 - predict(model, type = 'response'))^2))
}

# Nested models: drop the least significant remaining predictor(s) at
# each step (same sequence as before: SMK, then HTN, then DIA).
models <- list(glm1,
               update(glm1, . ~ . - SMK),
               update(glm1, . ~ . - SMK - HTN),
               update(glm1, . ~ . - SMK - HTN - DIA))

AICData <- data.frame(
  Iteration = seq_along(models) - 1L,          # 0-based, per assignment
  predictors_Eliminate = c(NA, 'SMK', 'HTN+SMK', 'HTN+SMK+DIA'),
  t(vapply(models, model_metrics, numeric(3))))

AICData %>%
  kbl(align = 'c') %>%
  kable_styling()
Iteration predictors_Eliminate AIC AUC Brier
1 NA 880.6832 0.7835507 0.0518735
2 SMK 877.2835 0.7832945 0.0518943
3 HTN+SMK 876.0971 0.7825693 0.0519763
4 HTN+SMK+DIA 875.3988 0.7810792 0.0519730

Checking :

# NOTE(review): library(leaps) is unnecessary for what follows --
# step() lives in base 'stats', and stepAIC() used later comes from MASS.
library (leaps)
# Automatic backward elimination by AIC as a cross-check of the manual
# table above (trace printed so each elimination step is visible).
back=step(glm1) 
## Start:  AIC=880.68
## DAY30 ~ SEX + AGE + DIA + PMI + HTN + SMK + BMI
## 
##        Df Deviance    AIC
## - SMK   2   863.28 877.28
## - HTN   1   863.42 879.42
## - DIA   1   864.15 880.15
## <none>      862.68 880.68
## - BMI   1   865.23 881.23
## - SEX   1   867.99 883.99
## - PMI   1   874.18 890.18
## - AGE   1   942.70 958.70
## 
## Step:  AIC=877.28
## DAY30 ~ SEX + AGE + DIA + PMI + HTN + BMI
## 
##        Df Deviance    AIC
## - HTN   1   864.10 876.10
## - DIA   1   864.69 876.69
## <none>      863.28 877.28
## - BMI   1   866.26 878.26
## - SEX   1   869.56 881.56
## - PMI   1   874.83 886.83
## - AGE   1   953.48 965.48
## 
## Step:  AIC=876.1
## DAY30 ~ SEX + AGE + DIA + PMI + BMI
## 
##        Df Deviance    AIC
## - DIA   1   865.40 875.40
## <none>      864.10 876.10
## - BMI   1   867.80 877.80
## - SEX   1   869.86 879.86
## - PMI   1   875.59 885.59
## - AGE   1   953.59 963.59
## 
## Step:  AIC=875.4
## DAY30 ~ SEX + AGE + PMI + BMI
## 
##        Df Deviance    AIC
## <none>      865.40 875.40
## - BMI   1   868.34 876.34
## - SEX   1   871.27 879.27
## - PMI   1   877.25 885.25
## - AGE   1   955.93 963.93
# Same backward search via MASS::stepAIC with the trace suppressed; it
# agrees with step() above (final model: DAY30 ~ SEX + AGE + PMI + BMI).
stepWiseLM <- stepAIC(object = glm1, direction = 'backward', trace = F)
summary(stepWiseLM)
## 
## Call:
## glm(formula = DAY30 ~ SEX + AGE + PMI + BMI, family = binomial, 
##     data = GustoW_update)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1541  -0.3903  -0.2418  -0.1526   3.4729  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.769e+00  9.736e-01  -7.979 1.47e-15 ***
## SEX          4.756e-01  1.941e-01   2.450 0.014281 *  
## AGE          8.575e-02  9.945e-03   8.622  < 2e-16 ***
## PMI          7.440e-01  2.084e-01   3.570 0.000357 ***
## BMI         -3.448e+02  2.061e+02  -1.673 0.094326 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1013.6  on 2187  degrees of freedom
## Residual deviance:  865.4  on 2183  degrees of freedom
## AIC: 875.4
## 
## Number of Fisher Scoring iterations: 6
# ROC curve of the reduced (backward-selected) model, with its AUC
# printed on the plot.
Roc1 <- roc(response = GustoW_update$DAY30,
            predictor = predict(stepWiseLM))
plot(Roc1, print.auc = TRUE)

  1. Plot the deviance residuals, the deviance residuals versus the fitted values, and the deviance residuals versus all predictors (one graph for each predictor). Plot the QQ-plot of the deviance residuals. Are the deviance residuals normally distributed?
# Diagnostics for the final (backward-selected) model.
fort.final <- fortify(stepWiseLM)
fort.final$.jackknife <- rstudent(stepWiseLM)
fort.final$rn <- row.names(fort.final)
stepWiseLMS=summary(stepWiseLM)
# Deviance residuals extracted from the glm summary object.
devRes <- stepWiseLMS$deviance.resid
# Index plot of the deviance residuals, coloured by outcome; points
# outside the +/-3 bands are labelled with their row index.
data.frame(y = GustoW_update$DAY30,
           index = 1:nrow(GustoW_update),
           deviance = devRes) %>%
  ggplot(aes(x = index, y = deviance, col = y)) +
  geom_point() +
  geom_hline(yintercept = -3, lty = 2, col = 'red') +
  geom_hline(yintercept = 3, lty = 2, col = 'red') +
  geom_hline(yintercept = 0, lty = 2) +
  geom_text(aes(label = ifelse(abs(deviance) > 3, index, '')), vjust = -0.5)

# Deviance residuals against every predictor (one facet each) with a
# loess smoother; |residual| > 2 is labelled with the row index.
# NOTE(review): gather() over mixed column types coerces 'value' to
# character when the factor SMK is included -- verify the SMK facet
# displays as intended.
GustoW_update %>%
  mutate(deviance = devRes,
         index = row_number()) %>%
  gather(variable, value, -deviance, -DAY30, -index) %>%
  ggplot(aes(x  = value, y =  deviance, color = DAY30)) +
  geom_point() +
  facet_wrap(~variable, scales = 'free') +
  geom_hline(yintercept = -2, lty = 2, col = 'red') +
  geom_hline(yintercept = 2, lty = 2, col = 'red') +
  geom_hline(yintercept = 0, lty = 2) +
  geom_text(aes(label = ifelse(abs(deviance) > 2, index, '')), vjust = -0.5) +
  stat_smooth(method="loess",se=F)

# Deviance residuals versus the fitted probabilities, coloured by the
# 0/1 outcome, with a loess smoother.  The red bands mark |residual| = 2;
# labels are only drawn above 2.5 to reduce clutter.
data.frame(index = 1:nrow(GustoW_update),
           y = GustoW_update$DAY30,
           predict = stepWiseLM$fitted.values, # can also use predict with type = response
           deviance = devRes) %>% 
  ggplot(aes(x = predict, y = deviance, color = y)) +
  geom_point() +
  geom_hline(yintercept = -2, lty = 2, col = 'red') +
  geom_hline(yintercept = 2, lty = 2, col = 'red') +
  geom_hline(yintercept = 0, lty = 2) +
  geom_text(aes(label = ifelse(abs(deviance) > 2.5, index, '')), vjust = -0.5) +
  stat_smooth(method="loess", se = F) # I choose 2.5 to remove number above some points

# QQ-plot of the deviance residuals.  Add the reference line so the
# departure from normality -- which the question asks about -- can
# actually be judged from the plot.
qqnorm(devRes)
qqline(devRes, lty = 2)

(f) Determine the predicted probability of an individual with the following predictor values: AGE = 67; SEX = 0; DIA = 0; HTN = 0; SMK = 3; PMI = 1; HEI = 187; WEI = 115. Compute the 95% confidence interval of the predicted probability for this individual. State the predicted probability and the confidence interval.

# Coefficients of the final model.  NOTE(review): BMI is on the kg/cm^2
# scale used throughout this document (see (a)), hence the large
# coefficient magnitude.
stepWiseLM$coefficients
##  (Intercept)          SEX          AGE          PMI          BMI 
##   -7.7687705    0.4756117    0.0857483    0.7439695 -344.7912561
# Predictor vector for the individual in (f): intercept, SEX = 0,
# AGE = 67, PMI = 1, BMI = 115 kg / (187 cm)^2 -- consistent with the
# cm-based BMI used when fitting the model.
x0 <- c(1,0,67,1,115/(187^2))
# Variance-covariance matrix of the coefficient estimates.
# NOTE(review): this shadows the earlier object S (summary of the
# linear fit from problem 1).
S <- vcov(stepWiseLM)
S
##               (Intercept)           SEX           AGE           PMI
## (Intercept)  9.479684e-01  0.0067995017 -7.997535e-03 -1.666891e-04
## SEX          6.799502e-03  0.0376816926 -3.607839e-04  4.241197e-03
## AGE         -7.997535e-03 -0.0003607839  9.891219e-05 -9.632451e-05
## PMI         -1.666891e-04  0.0042411975 -9.632451e-05  4.343084e-02
## BMI         -1.437937e+02  0.8054436177  4.581698e-01 -2.646149e+00
##                       BMI
## (Intercept)  -143.7936512
## SEX             0.8054436
## AGE             0.4581698
## PMI            -2.6461493
## BMI         42473.4701493
# Linear predictor (log-odds) at x0, and the 95% Wald half-width on the
# link scale: z_{0.975} * sqrt(x0' V x0).
eta <- t(x0) %*% coef(stepWiseLM)
se <- qnorm(0.975) * sqrt(t(x0) %*% S %*% x0)
eta
##           [,1]
## [1,] -2.413555
# 95% confidence interval on the log-odds scale.
c(eta - se, eta + se)
## [1] -2.84560 -1.98151
# Predicted probability: inverse logit of the linear predictor.  Use the
# standard-library plogis() instead of hand-rolled exp(eta)/(1+exp(eta));
# the value is identical and plogis is numerically stable for large |eta|.
plogis(eta)
##            [,1]
## [1,] 0.08214488
# The 95% confidence interval on the probability scale: map the
# endpoints of the log-odds interval through the inverse logit (valid
# because the logistic function is monotone).
plogis(c(eta - se, eta + se))
## [1] 0.05490921 0.12115794